Due: Thursday 05 Feb 3:30 PM

Grading criteria: Answer all the questions completely (including all the check boxes). On time submission.

Code a classification prediction task

This example is a species distribution model. The goal of this assignment is to find the best performing KNN model to predict the presence or absence of the Willow Tit, a bird native to Switzerland, using elevation as a predictor.

library(ggplot2)
library(dplyr)

# Load willow tit data: recode the 0/1 detection column `y.1` into a
# labelled character column ("present"/"absent") and keep only status + elevation
willowtit <- read.csv("data/wtmatrix.csv") |> 
    mutate(status = ifelse(y.1 == 1, "present", "absent")) |> 
    select(status, elev)

# Check the data: first rows and the distribution of elevations
head(willowtit)
##   status elev
## 1 absent  420
## 2 absent  450
## 3 absent 1050
## 4 absent 1110
## 5 absent  510
## 6 absent  630
summary(willowtit$elev)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250     570    1070    1183    1820    2750
# KNN classification (adapted from Brett's example code; one predictor
# here — elevation — instead of two).
#
# x:     matrix of training predictors (rows = observations)
# y:     vector of training labels (exactly two categories)
# x_new: matrix of new predictor values to classify (same columns as x)
# k:     number of nearest neighbours that vote
#
# Returns a vector of predicted labels, one per row of x_new. Distance
# ties are broken randomly, and a near-50/50 neighbourhood vote (only
# possible for even k) is resolved by random assignment.
knn_classify2 <- function(x, y, x_new, k) {
    category <- unique(y)                    # the two labels, in order of appearance
    y_int <- ifelse(y == category[1], 1, 0)  # encode category[1] as 1
    nx <- nrow(x)
    n <- nrow(x_new)
    n_col <- ncol(x_new)                     # renamed from `c` to avoid masking base::c
    p_cat1 <- rep(NA_real_, n)               # vote share for category[1]
    for ( i in seq_len(n) ) {
        # Euclidean distance of x_new[i, ] to every training row
        x_new_m <- matrix(x_new[i,], nx, n_col, byrow=TRUE)
        d <- sqrt(rowSums((x - x_new_m) ^ 2))
        # Sort labels by ascending distance; break distance ties randomly
        y_sort <- y_int[order(d, sample(seq_along(d)))]
        p_cat1[i] <- mean(y_sort[1:k])
    }
    y_pred <- ifelse(p_cat1 > 0.5, category[1], category[2])
    # Votes within tol of exactly 0.5 are ties: assign those at random
    # instead of always defaulting to category[2]
    rnd_category <- sample(category, n, replace=TRUE)
    tol <- 1 / (k * 10)
    y_pred <- ifelse(abs(p_cat1 - 0.5) < tol, rnd_category, y_pred)
    return(y_pred)
}

# Grid of 200 evenly spaced elevations spanning the lowest to highest
# observed elevation in the dataset; predictions are made at each point.
elev_grid <- data.frame(
    elev = seq(
        from = min(willowtit$elev),
        to = max(willowtit$elev),
        length.out = 200
    )
)

Q1

# k values to explore
k_values <- c(1, 3, 5, 10, 25, 50, 100)

# Run the classifier once per k over the elevation grid and collect one
# data frame of predictions per k.
predictions_list <- lapply(k_values, function(k) {
    pred <- knn_classify2(
        x = as.matrix(willowtit["elev"]),  # training predictor: elevation
        y = willowtit$status,              # training labels for the 237 sites
        x_new = as.matrix(elev_grid),      # 200 grid elevations to predict at
        k = k                              # neighbourhood size being tried
    )
    data.frame(
        elev = elev_grid$elev,
        predicted_status = pred,
        k_knn = k
    )
})
names(predictions_list) <- as.character(k_values)

# Stack the per-k data frames into one long data frame
all_predictions <- bind_rows(predictions_list)
head(all_predictions)  # quick look at the first rows of combined predictions
##       elev predicted_status k_knn
## 1 250.0000           absent     1
## 2 262.5628           absent     1
## 3 275.1256           absent     1
## 4 287.6884           absent     1
## 5 300.2513           absent     1
## 6 312.8141           absent     1
# Visualize the predictions: elevation on x, predicted status on y,
# one facet per k value.
ggplot(all_predictions, aes(x = elev, y = predicted_status)) +
    # small vertical jitter separates overlapping points
    geom_jitter(width = 0, height = 0.1, size = 1.5, alpha = 0.4) +
    facet_wrap(~ k_knn, ncol = 2) +
    labs(
        x = "Elevation (m)",
        y = "Predicted Status",
        title = "KNN Predictions: Willow Tit Presence vs Elevation"
    ) +
    theme_minimal()

    # shows that willowtits are present at mid elevations mostly
    # absence at all elevations for lower k values, predictions appear throughout the range indicating high variance/noise
    # less absence at mid elevations for higher k values, clearer pattern emerges with mid elev showing consistent present predictions and low and high elevations showing absence. Smoothing effect of larger k 
# Alternative format: colour-coding helps distinguish present from absent.
ggplot(all_predictions,
       aes(x = elev, y = predicted_status, color = predicted_status)) +
    geom_jitter(width = 0, height = 0.1, size = 2, alpha = 0.6) +
    facet_wrap(~ k_knn, ncol = 2) +
    scale_color_manual(values = c("absent" = "coral", "present" = "steelblue")) +
    labs(
        x = "Elevation (m)",
        y = "Predicted Status",
        color = "Prediction",
        title = "KNN Predictions: Willow Tit Presence vs Elevation"
    ) +
    theme_minimal() +
    theme(legend.position = "bottom")

Q2

Q2. Make a new function, called knn_classify2_probs, by modifying the knn_classify2 function so that instead of returning a character vector of the predictions, it returns a numeric vector of the probabilities. To do this, you should work through the existing function line-by-line to understand its flow. Hint: the solution is to remove some lines of code. Also, the KNN function predicts the probability of cat1, which in this case could be absence, so be sure to include code that returns always (in the general case, not just this dataset) the probability of presence.

# KNN probability function: identical neighbour search to knn_classify2,
# but returns the numeric probability of presence instead of hard labels.
#
# x:     matrix of training predictors
# y:     training labels (two categories; expected to include "present")
# x_new: matrix of new predictor values
# k:     number of nearest neighbours to average over
#
# Returns a numeric vector: P("present") for each row of x_new.
knn_classify2_probs <- function(x, y, x_new, k) {
    category <- unique(y)                    # the two category names
    y_int <- ifelse(y == category[1], 1, 0)  # encode category[1] as 1
    nx <- nrow(x)
    n <- nrow(x_new)
    n_col <- ncol(x_new)                     # renamed from `c` to avoid masking base::c
    p_cat1 <- rep(NA_real_, n)
    
    for ( i in seq_len(n) ) {
        # Euclidean distance of x_new[i, ] to every training row
        x_new_m <- matrix(x_new[i,], nx, n_col, byrow=TRUE)
        d <- sqrt(rowSums((x - x_new_m) ^ 2))
        # Sort labels by ascending distance; break distance ties randomly
        y_sort <- y_int[order(d, sample(seq_along(d)))]
        # Mean of the k nearest encoded labels = probability of category[1]
        p_cat1[i] <- mean(y_sort[1:k])
    }
    
    # Always return the probability of "present": the loop computes
    # P(category[1]), which may be "absent" depending on data order.
    # NOTE(review): assumes one of the two categories is literally
    # "present"; if neither is, this returns P(category[2]).
    if (category[1] == "present") {
        return(p_cat1)
    } else {
        return(1 - p_cat1)  # category[1] is not "present": flip the probability
    }
}
# k values to explore for the probability curves
k_values_prob <- c(1, 5, 25, 100)

# Compute a data frame of presence probabilities per k; stacked later.
prob_predictions_list <- lapply(k_values_prob, function(k) {
    prob_pred <- knn_classify2_probs(
        x = as.matrix(willowtit["elev"]),
        y = willowtit$status,
        x_new = as.matrix(elev_grid),
        k = k
    )
    data.frame(
        elev = elev_grid$elev,
        prob_present = prob_pred,
        k_knn = k
    )
})
names(prob_predictions_list) <- as.character(k_values_prob)

# Combine all predictions, then plot probability of presence against
# elevation with one panel per k.
prob_predictions <- bind_rows(prob_predictions_list)
ggplot(prob_predictions, aes(x = elev, y = prob_present)) +
    geom_point(color = "steelblue", size = 2, alpha = 0.6) +
    # dashed line at 0.5 marks the present/absent decision boundary
    geom_hline(yintercept = 0.5, color = "red", linetype = "dashed") +
    facet_wrap(~ k_knn, ncol = 2,
               labeller = labeller(k_knn = function(x) paste("k =", x))) +
    labs(
        x = "Elevation (m)",
        y = "Probability of Presence",
        title = "Predicted probability of Willow Tit presence at given elevation"
    ) +
    ylim(0, 1) +
    theme_minimal()

How the shape changes with k_knn

With k = 1, the plot doesn't show much of a pattern — the predicted probabilities are basically just 1s and 0s.

With k = 5, more of a pattern but pretty jagged

With k = 25, the function is much smoother, showing a clear curve of presence at mid elevations and absence at low and high elevations. The transition from low to high probability is gradual, with peak probabilities occurring around 1500-1800m elevation.

With k = 100, the function is very smooth but becomes almost linear after the presence peak at mid elevations (i.e. it tapers off around 0.5 probability at high elevations). This looks less accurate to me than k=25.

Bias-Variance Tradeoff

k = 1 exhibits extremely high variance and low bias (the predictions are all over the place, very jumpy). too sensitive to individual training points

k = 5 still shows high variance with a jagged pattern, though it’s starting to reveal the general trend of mid-elevation presence (just a bit noisy). The model remains quite flexible and sensitive to individual training points, making it prone to overfitting, but less so than k=1.

k = 25 appears to strike the best balance between bias and variance because it represents the prediction smoothly yet relatively closely. The smooth curve clearly captures the biological pattern: willow tits prefer mid-elevations (peak around 1500-1800m) with gradual transitions to absence at both low and high elevations. This model is flexible enough to capture the non-linear relationship without being overly sensitive to individual data points or noise.

k = 100 exhibits low variance but higher bias. While very smooth and stable, the function becomes too simple, particularly visible in how it tapers off linearly around 0.5 probability at high elevations rather than dropping to zero. This suggests underfitting - the model is too constrained and misses important features of the true pattern, making it less accurate than k=25.

Overall, k = 25 provides the optimal balance, capturing the key elevation-presence relationship smoothly while remaining true to the data pattern.

Q3

Q3. Tune the KNN model algorithm using the CV inference algorithm. Specifically, using the original knn_classify2 function, and based on the code in classification_knn.R, find the best predictive KNN model using 5, 10, and n-fold CV. Hint: be sure to search adequately across the possible range of k_knn. You want to see the error go down, and then go back up again. If the error is not obviously changing much, you’ve probably got more parameter space to explore.

# Fold-assignment helper (from Brett's "source/random_partitions.R"),
# defined inline here.
# Randomly assigns each of n rows to one of k roughly equal-sized folds;
# returns an integer vector of length n containing shuffled fold labels 1..k.
random_partitions <- function(n, k) {
    fold_labels <- rep(seq_len(k), length.out = n)  # balanced labels 1..k
    sample(fold_labels)                             # shuffle the assignment
}
# k-fold cross-validation error for the willow tit KNN model.
#
# Procedure: split the data into k_cv random folds, train on k_cv - 1
# folds and test on the held-out fold; repeat so every fold is the test
# set exactly once, then average the misclassification rate across folds.
#
# willowtit: willow tit data frame (columns `status` and `elev`)
# k_cv:      number of folds (integer scalar)
# k_knn:     number of nearest neighbours for KNN — the value being tuned
# Returns the CV error rate (numeric scalar).
cv_knn_willowtit <- function(willowtit, k_cv, k_knn) {
    # Randomly assign each row to one of the k_cv folds
    willowtit$partition <- random_partitions(nrow(willowtit), k_cv)

    fold_error <- vapply(seq_len(k_cv), function(fold) {
        in_test <- willowtit$partition == fold
        test_data <- willowtit[in_test, ]    # held-out fold
        train_data <- willowtit[!in_test, ]  # remaining folds
        # Predict the held-out statuses from the training folds
        pred_status <- knn_classify2(
            x = as.matrix(train_data["elev"]),
            y = train_data$status,
            x_new = as.matrix(test_data["elev"]),
            k = k_knn
        )
        # Misclassification rate for this fold
        mean(test_data$status != pred_status)
    }, numeric(1))

    # Average error across all folds
    mean(fold_error)
}

# Sanity-check the CV function at k_knn = 10 across the three fold schemes
cat("Testing CV function:\n")
## Testing CV function:
cv_knn_willowtit(willowtit, k_cv = 5, k_knn = 10)  # 5-fold: ~0.29 error rate (~71% accuracy)
## [1] 0.2911348
cv_knn_willowtit(willowtit, k_cv = 10, k_knn = 10) # 10-fold: ~0.25 error rate
## [1] 0.2452899
cv_knn_willowtit(willowtit, k_cv = nrow(willowtit), k_knn = 10)  # n-fold (237-fold, LOOCV): ~0.25 error rate
## [1] 0.2489451
# Explore a range of k_knn values under each CV fold scheme. Search wide
# enough that the error visibly falls and then rises again.
grid <- expand.grid(
    k_cv = c(5, 10, nrow(willowtit)),  # 5-fold, 10-fold, and LOOCV
    k_knn = 1:100  # widened from 1:50 so the rise in error at large k is visible
)

cv_error <- rep(NA_real_, nrow(grid))  # one CV error per grid row

set.seed(123)  # For reproducibility
for ( i in seq_len(nrow(grid)) ) {
    cv_error[i] <- cv_knn_willowtit(willowtit, grid$k_cv[i], grid$k_knn[i])
}

result_single <- cbind(grid, cv_error)

# Look for the minimum error: the five lowest-error k_knn values per CV scheme
result_single %>%
    group_by(k_cv) %>%
    slice_min(cv_error, n = 5)
## # A tibble: 16 × 3
## # Groups:   k_cv [3]
##     k_cv k_knn cv_error
##    <dbl> <int>    <dbl>
##  1     5    35    0.220
##  2     5    22    0.224
##  3     5    34    0.224
##  4     5    54    0.224
##  5     5     8    0.227
##  6    10    34    0.215
##  7    10    57    0.228
##  8    10    27    0.228
##  9    10    35    0.232
## 10    10    69    0.232
## 11    10    70    0.232
## 12   237    44    0.224
## 13   237    82    0.224
## 14   237    41    0.224
## 15   237    43    0.224
## 16   237    42    0.228
### Part 1 suggested k = 25 was best of the four options tried; this fuller search shows the minimum is actually in the mid 30s (around 34-35)
# Plot the CV error against k for each fold scheme
ggplot(result_single, aes(x = k_knn, y = cv_error, color = factor(k_cv))) +
    geom_line(linewidth = 1) +
    geom_point() +
    labs(
        color = "CV folds",
        x = "k (number of nearest neighbors)",
        y = "CV Error Rate",
        title = "Single Run: CV Error vs k (Number of Neighbors)"
    ) +
    theme_minimal()

## from plot can interpret that lowest CV_error is around k=35 nearest neighbors
## error goes up to a max around k=11, then decreases to a minimum around k=35, then increases again slightly (but mostly still lower than k<20). It kind of bottoms out, so I'm going to change 1:50 to 1:100 to try more k values 
## nice, now with k = 1:100 the errors very clearly increase at around k=75, min is still around k=35 
# Re-run the whole search with a different seed as a stability check:
# if the data are partitioned differently, do we get similar results?
set.seed(456)  # Different seed
cv_error2 <- rep(NA_real_, nrow(grid))

for ( i in seq_len(nrow(grid)) ) {
    cv_error2[i] <- cv_knn_willowtit(willowtit, grid$k_cv[i], grid$k_knn[i])
}

result_single2 <- cbind(grid, cv_error = cv_error2)

# Stack both runs into long format for a side-by-side comparison plot
comparison <- data.frame(
    k_cv = rep(grid$k_cv, 2),
    k_knn = rep(grid$k_knn, 2),
    cv_error = c(result_single$cv_error, result_single2$cv_error),
    run = rep(c("Run 1", "Run 2"), each = nrow(grid))
)

# One line per (fold scheme, run): solid vs dashed distinguishes the runs
ggplot(comparison,
       aes(x = k_knn, y = cv_error, color = factor(k_cv), linetype = run)) +
    geom_line(linewidth = 1) +
    labs(
        linetype = "Run",
        color = "CV folds",
        x = "k (number of nearest neighbors)",
        y = "CV Error Rate",
        title = "Stability Check: Two CV Runs with Different Random Seeds"
    ) +
    theme_minimal()

# Hard to tell with so many lines overlapping! But I think it shows that the two lines (runs - dotted vs solid) dont differ much, seem to track 
# Now: run 100 replicate CV runs to get a stable, averaged estimate of
# the CV error for every (k_cv, k_knn) combination.

grid_stable <- expand.grid(  # same fold schemes as before
    k_cv = c(5, 10, nrow(willowtit)),
    k_knn = 1:50
)

reps <- 100  # number of replicate CV runs

# Rows = each k_cv/k_knn combination (150 rows for k_knn 1-50);
# columns = one replicate run each (100 columns).
# Averaging across the columns gives a stable error estimate.
cv_error_matrix <- matrix(NA_real_, nrow = nrow(grid_stable), ncol = reps)

set.seed(789)  # reproducible replicate runs
cat("Running", reps, "replicate CV runs... this may take a few minutes\n")
## Running 100 replicate CV runs... this may take a few minutes
for ( j in seq_len(reps) ) {  # loop over replicates...
    for ( i in seq_len(nrow(grid_stable)) ) {  # ...and all grid combinations
        cv_error_matrix[i, j] <- cv_knn_willowtit(
            willowtit, 
            grid_stable$k_cv[i], 
            grid_stable$k_knn[i]
        )
    }
    if (j %% 10 == 0) cat("Completed", j, "reps\n")  # Progress monitor
}
## Completed 10 reps
## Completed 20 reps
## Completed 30 reps
## Completed 40 reps
## Completed 50 reps
## Completed 60 reps
## Completed 70 reps
## Completed 80 reps
## Completed 90 reps
## Completed 100 reps
result_stable <- cbind(grid_stable, cv_error_matrix)
result_stable$mean_cv <- rowMeans(result_stable[, -(1:2)])  # mean over the 100 replicate columns

cat("\n✓ Done!\n")
## 
## ✓ Done!
# Find the optimal k for each CV method: five lowest mean errors per scheme
result_stable %>%
    select(k_cv, k_knn, mean_cv) %>%
    group_by(k_cv) %>%
    slice_min(mean_cv, n = 5)
## # A tibble: 15 × 3
## # Groups:   k_cv [3]
##     k_cv k_knn mean_cv
##    <dbl> <int>   <dbl>
##  1     5    33   0.238
##  2     5    47   0.238
##  3     5    46   0.238
##  4     5    38   0.239
##  5     5    44   0.239
##  6    10    37   0.235
##  7    10    39   0.235
##  8    10    40   0.236
##  9    10    38   0.236
## 10    10    41   0.238
## 11   237    41   0.224
## 12   237    43   0.224
## 13   237    42   0.226
## 14   237    44   0.227
## 15   237    40   0.229
# Plot the averaged CV error against k for each fold scheme
ggplot(result_stable, aes(x = k_knn, y = mean_cv, color = factor(k_cv))) +
    geom_line(linewidth = 1.2) +
    geom_point(size = 2) +
    labs(
        color = "CV folds",
        x = "k (number of nearest neighbors)",
        y = "Mean CV Error Rate",
        title = "Mean CV Error Across 100 Replicate Runs"
    ) +
    theme_minimal()

### LOOCV yielded the lowest error value (22.4%) at a k of between 40-44. 10-fold had slightly lower error than 5-fold, but neither the errors nor the optimal k ranges differ much. 
## LOOCV shows that around k=41-43 yielded the lowest error value, so that's what I'd pick. 

Q4

Q4. Visualize probability versus elevation for the best model.

# Use the tuned neighbourhood size from the CV search (k = 41) to
# generate presence probabilities across the elevation grid.
best_k <- 41

prob_best <- knn_classify2_probs(
    x = as.matrix(willowtit["elev"]),
    y = willowtit$status,
    x_new = as.matrix(elev_grid),  # elevation grid built earlier
    k = best_k
)

# Pair each grid elevation with its probability, then derive the hard
# present/absent call (used only for colour-coding the plot).
best_model_probs <- data.frame(
    elev = elev_grid$elev,
    prob_present = prob_best
) %>%
    mutate(predicted = ifelse(prob_present > 0.5, "present", "absent"))
# Visualize the best model's probability curve, colour-coded by the
# implied present/absent call.
ggplot(best_model_probs, aes(x = elev, y = prob_present, color = predicted)) +
    geom_point(size = 2.5, alpha = 0.7) +
    # dashed line at 0.5: the decision boundary between absent and present
    geom_hline(yintercept = 0.5, color = "black", linetype = "dashed", linewidth = 1) +
    scale_color_manual(
        name = "Predicted Status",
        values = c("absent" = "coral", "present" = "steelblue")
    ) +
    labs(
        x = "Elevation (m)",
        y = "Probability of Presence",
        title = "Best KNN Model: Probability of Bird Presence vs Elevation (k=41)"
    ) +
    ylim(0, 1) +
    theme_minimal() +
    theme(legend.position = "right")

Answering questions for Q4:

Bell-shaped curve — as elevation increases, the probability of finding the birds increases, crossing the 50% threshold at an elevation of around 1250m and peaking around 1700m, before decreasing again. It falls below 50% probability at about 2000m elevation, after which increasing elevation decreases the probability of finding the birds.

Therefore predicted elevation range of presence is about 1250-2000m in elevation.

The probability of 0.5 is the decision boundary - it represents the point where presence and absence are equally likely. When probability < 0.5, it means that among the k=41 nearest neighbors, fewer than half (less than 20.5 out of 41) were labeled “present” in the training data. Therefore, the majority of neighbors vote for “absent”, and that’s what the model predicts.

Q5

Q5. Make a map of where the Willow Tit is predicted to be.

# Load the Swiss elevation data (tidy raster: one row per grid cell with x, y, Elev_m)
# swissdem <- read.csv("data/swissdem.csv")
swissdem <- read.csv("data/switzerland_tidy.csv")
# Predict presence/absence at every Swiss grid cell using the tuned model (k = 41)
swiss_predictions <- knn_classify2(
    x = as.matrix(willowtit["elev"]),
    y = willowtit$status,
    x_new = as.matrix(swissdem["Elev_m"]),  # Predict for all Swiss elevations
    k = 41
)

# Attach predictions to the map data
swissdem$pred_status <- swiss_predictions

# Check it worked: counts of predicted absent/present grid cells
table(swissdem$pred_status)
## 
##  absent present 
##   98858   23650
# The default colour scheme is hard to read: use a greyscale elevation
# base map with a magenta overlay for predicted presence, plus a legend.
ggplot() +
    # Elevation base layer in grayscale
    geom_raster(data = swissdem,
                aes(x = x, y = y, fill = Elev_m)) +
    scale_fill_gradient(low = "white", high = "black", name = "Elevation (m)") +
    # Overlay the predicted-presence cells; mapping `color` to a constant
    # string creates a dummy legend entry for this magenta layer
    geom_tile(data = filter(swissdem, pred_status == "present"),
              aes(x = x, y = y, color = "Predicted Presence"),
              fill = "magenta", alpha = 0.6, linewidth = 0) +
    scale_color_manual(
        name = "",
        values = c("Predicted Presence" = "magenta"),
        guide = guide_legend(override.aes = list(fill = "magenta", alpha = 0.6))
    ) +
    coord_quickmap() +  # keep the map's aspect ratio approximately correct
    labs(
        title = "Predicted Distribution of Willow Tit in Switzerland"
    ) +
    theme_void() +
    theme(
        plot.title = element_text(hjust = 0.5, vjust = -2, face = "bold"),
        legend.position = c(0.95, 0.95),  # Top right
        legend.justification = c(1, 1),    # Anchor at top-right of legend box
        legend.background = element_rect(fill = "white", color = "black")
    )

Do the predictions make sense?

Yes they make sense, seeing the magenta (birds probable) around mid elevations!